In [2]:
%matplotlib inline
from itertools import combinations

import matplotlib.pyplot as plt
Can you plot a histogram of word frequencies for the data/aristotle.txt file?
In [11]:
# Count how often each whitespace-separated token occurs in the text.
# Tokens are split on whitespace only, so case and punctuation are kept as-is.
words = {}
with open('../data/aristotle.txt') as handle:  # closed as soon as counting ends
    for line in handle:
        for word in line.rstrip().split():
            words[word] = words.get(word, 0) + 1

# One bar per distinct word, ordered from the rarest to the most frequent.
plt.figure(figsize=(18, 7))
plt.bar(range(len(words)),
        sorted(words.values()))
plt.xlabel('word')
# The counts live on the y axis; labelling x a second time would overwrite 'word'.
plt.ylabel('occurrences');
Can you investigate the relationships between the variables of the data/abundance.tsv file? Which plot type is best for this task?
Can you investigate the relationships between the variables of the data/abundance.tsv file in a single figure, using subplots?
In [12]:
import pandas as pd
In [14]:
# Load the tab-separated abundance table into a DataFrame.
ab = pd.read_csv('../data/abundance.tsv', sep='\t')
In [15]:
# Preview the first five rows of the table.
ab.head(n=5)
Out[15]:
In [48]:
# Scatter every pair of the four abundance columns in a 3x2 grid,
# one panel per pair.
columns = ['length', 'eff_length', 'est_counts', 'tpm']
plt.figure(figsize=(12, 18))
# combinations() yields the six unordered pairs in a fixed order:
# (length, eff_length), (length, est_counts), (length, tpm),
# (eff_length, est_counts), (eff_length, tpm), (est_counts, tpm).
for panel, (x_col, y_col) in enumerate(combinations(columns, 2), start=1):
    plt.subplot(3, 2, panel)
    plt.plot(ab[x_col],
             ab[y_col],
             'k.')
    # Axis labels come from the column names, so they cannot drift
    # out of sync with the data actually plotted.
    plt.xlabel(x_col)
    plt.ylabel(y_col)
In [24]:
import seaborn as sns
In [27]:
# much easier with seaborn: one call replaces the hand-built subplot grid
# Use target_id as the index so it is not treated as a plotted column.
indexed_ab = ab.set_index('target_id')
sns.pairplot(indexed_ab);
Can you investigate the relationships between the variables of the data/abundance.tsv file in a single plot? You might want to use different colors...
In [30]:
# Overlay the three other columns against 'length' in a single axes.
# Each series gets the next colour of the default cycle.
plt.figure(figsize=(10, 10))
for column in ('eff_length', 'est_counts', 'tpm'):
    plt.plot(ab['length'],
             ab[column],
             '.',
             label=column)  # legend entry always names the plotted column
plt.legend(loc='best')
plt.xlabel('length')
plt.ylabel('other variable');
Can you plot the relationship between word length and number of vowels in the data/unixdict.txt file?
In [40]:
# Letters treated as vowels ('y' included); only lowercase letters match.
vowels = set('aeiouy')
dictionary1 = {}
# key: word
# value: length of the word
dictionary2 = {}
# key: word
# value: number of vowels (every occurrence counts, so 'banana' -> 3)
with open('../data/unixdict.txt') as handle:  # closed once all lines are read
    for line in handle:
        word = line.rstrip()
        dictionary1[word] = len(word)
        # Count vowel occurrences letter by letter; intersecting with a set
        # would only give the number of *distinct* vowels.
        dictionary2[word] = sum(letter in vowels for letter in word)
In [43]:
# Scatter the vowel count of each word against its length.
# Both dictionaries are keyed by the same words, so their values are
# expected to line up one-to-one.
word_lengths = list(dictionary1.values())
vowel_counts = list(dictionary2.values())

vowel_fig, vowel_ax = plt.subplots(figsize=(7, 7))
vowel_ax.plot(word_lengths, vowel_counts, 'ko')
vowel_ax.set_xlabel('word length')
vowel_ax.set_ylabel('number of vowels');
Can you plot three variables (with very different scales) in a single plot? You can google to look for an answer...
In [45]:
# Two y axes sharing one x axis let variables with very different
# scales appear in the same figure.
twin_fig, left_ax = plt.subplots(figsize=(7, 7))

# Left axis: eff_length against length, black dots.
left_ax.plot(ab['length'], ab['eff_length'], 'k.')
left_ax.set_xlabel('length')
left_ax.set_ylabel('eff_length')

# Right axis: tpm against the same x values, red dots.
ax = left_ax.twinx()
ax.plot(ab['length'], ab['tpm'], 'r.')
ax.set_ylabel('tpm');
Can you figure out how to make boxplots out of one of the variables of the data/abundance.tsv file?
In [55]:
# Box plot of the 'length' column in a narrow, tall figure.
plt.figure(figsize=(2, 7))
plt.boxplot(ab['length'])
plt.ylabel('length')
# restrict the range of the plot; anything above 10000 is cut off from view
plt.ylim(0, 10000);  # trailing ';' suppresses the returned limits repr
Out[55]: